tutorials/017 - Partition Projection.ipynb (900 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"[](https://github.com/aws/aws-sdk-pandas)\n",
"\n",
"# 17 - Partition Projection\n",
"\n",
"https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import getpass\n",
"from datetime import datetime\n",
"\n",
"import pandas as pd\n",
"\n",
"import awswrangler as wr"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Enter your bucket name:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ···········································\n"
]
}
],
"source": [
"bucket = getpass.getpass()"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Integer projection"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>value</th>\n",
" <th>year</th>\n",
" <th>month</th>\n",
" <th>day</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2019</td>\n",
" <td>10</td>\n",
" <td>25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2020</td>\n",
" <td>11</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>2021</td>\n",
" <td>12</td>\n",
" <td>27</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" value year month day\n",
"0 1 2019 10 25\n",
"1 2 2020 11 26\n",
"2 3 2021 12 27"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({\"value\": [1, 2, 3], \"year\": [2019, 2020, 2021], \"month\": [10, 11, 12], \"day\": [25, 26, 27]})\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"wr.s3.to_parquet(\n",
" df=df,\n",
" path=f\"s3://{bucket}/table_integer/\",\n",
" dataset=True,\n",
" partition_cols=[\"year\", \"month\", \"day\"],\n",
" database=\"default\",\n",
" table=\"table_integer\",\n",
" athena_partition_projection_settings={\n",
" \"projection_types\": {\"year\": \"integer\", \"month\": \"integer\", \"day\": \"integer\"},\n",
" \"projection_ranges\": {\"year\": \"2000,2025\", \"month\": \"1,12\", \"day\": \"1,31\"},\n",
" },\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>value</th>\n",
" <th>year</th>\n",
" <th>month</th>\n",
" <th>day</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>2021</td>\n",
" <td>12</td>\n",
" <td>27</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2020</td>\n",
" <td>11</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2019</td>\n",
" <td>10</td>\n",
" <td>25</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" value year month day\n",
"0 3 2021 12 27\n",
"1 2 2020 11 26\n",
"2 1 2019 10 25"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.athena.read_sql_query(\"SELECT * FROM table_integer\", database=\"default\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Enum projection"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>value</th>\n",
" <th>city</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>São Paulo</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Tokio</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Seattle</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" value city\n",
"0 1 São Paulo\n",
"1 2 Tokio\n",
"2 3 Seattle"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(\n",
" {\n",
" \"value\": [1, 2, 3],\n",
" \"city\": [\"São Paulo\", \"Tokio\", \"Seattle\"],\n",
" }\n",
")\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"wr.s3.to_parquet(\n",
" df=df,\n",
" path=f\"s3://{bucket}/table_enum/\",\n",
" dataset=True,\n",
" partition_cols=[\"city\"],\n",
" database=\"default\",\n",
" table=\"table_enum\",\n",
" athena_partition_projection_settings={\n",
" \"projection_types\": {\n",
" \"city\": \"enum\",\n",
" },\n",
" \"projection_values\": {\"city\": \"São Paulo,Tokio,Seattle\"},\n",
" },\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>value</th>\n",
" <th>city</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>São Paulo</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>Seattle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>Tokio</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" value city\n",
"0 1 São Paulo\n",
"1 3 Seattle\n",
"2 2 Tokio"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.athena.read_sql_query(\"SELECT * FROM table_enum\", database=\"default\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Date projection"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>value</th>\n",
" <th>dt</th>\n",
" <th>ts</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2020-01-01</td>\n",
" <td>2020-01-01 00:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2020-01-02</td>\n",
" <td>2020-01-01 00:00:01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>2020-01-03</td>\n",
" <td>2020-01-01 00:00:02</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" value dt ts\n",
"0 1 2020-01-01 2020-01-01 00:00:00\n",
"1 2 2020-01-02 2020-01-01 00:00:01\n",
"2 3 2020-01-03 2020-01-01 00:00:02"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def ts(x):\n",
" return datetime.strptime(x, \"%Y-%m-%d %H:%M:%S\")\n",
"\n",
"\n",
"def dt(x):\n",
" return datetime.strptime(x, \"%Y-%m-%d\").date()\n",
"\n",
"\n",
"df = pd.DataFrame(\n",
" {\n",
" \"value\": [1, 2, 3],\n",
" \"dt\": [dt(\"2020-01-01\"), dt(\"2020-01-02\"), dt(\"2020-01-03\")],\n",
" \"ts\": [ts(\"2020-01-01 00:00:00\"), ts(\"2020-01-01 00:00:01\"), ts(\"2020-01-01 00:00:02\")],\n",
" }\n",
")\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"wr.s3.to_parquet(\n",
" df=df,\n",
" path=f\"s3://{bucket}/table_date/\",\n",
" dataset=True,\n",
" partition_cols=[\"dt\", \"ts\"],\n",
" database=\"default\",\n",
" table=\"table_date\",\n",
" athena_partition_projection_settings={\n",
" \"projection_types\": {\n",
" \"dt\": \"date\",\n",
" \"ts\": \"date\",\n",
" },\n",
" \"projection_ranges\": {\"dt\": \"2020-01-01,2020-01-03\", \"ts\": \"2020-01-01 00:00:00,2020-01-01 00:00:02\"},\n",
" },\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>value</th>\n",
" <th>dt</th>\n",
" <th>ts</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2020-01-01</td>\n",
" <td>2020-01-01 00:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2020-01-02</td>\n",
" <td>2020-01-01 00:00:01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>2020-01-03</td>\n",
" <td>2020-01-01 00:00:02</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" value dt ts\n",
"0 1 2020-01-01 2020-01-01 00:00:00\n",
"1 2 2020-01-02 2020-01-01 00:00:01\n",
"2 3 2020-01-03 2020-01-01 00:00:02"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.athena.read_sql_query(\"SELECT * FROM table_date\", database=\"default\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Injected projection"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>value</th>\n",
" <th>uuid</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>761e2488-a078-11ea-bb37-0242ac130002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>b89ed095-8179-4635-9537-88592c0f6bc3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>87adc586-ce88-4f0a-b1c8-bf8e00d32249</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" value uuid\n",
"0 1 761e2488-a078-11ea-bb37-0242ac130002\n",
"1 2 b89ed095-8179-4635-9537-88592c0f6bc3\n",
"2 3 87adc586-ce88-4f0a-b1c8-bf8e00d32249"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(\n",
" {\n",
" \"value\": [1, 2, 3],\n",
" \"uuid\": [\n",
" \"761e2488-a078-11ea-bb37-0242ac130002\",\n",
" \"b89ed095-8179-4635-9537-88592c0f6bc3\",\n",
" \"87adc586-ce88-4f0a-b1c8-bf8e00d32249\",\n",
" ],\n",
" }\n",
")\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"wr.s3.to_parquet(\n",
" df=df,\n",
" path=f\"s3://{bucket}/table_injected/\",\n",
" dataset=True,\n",
" partition_cols=[\"uuid\"],\n",
" database=\"default\",\n",
" table=\"table_injected\",\n",
" athena_partition_projection_settings={\n",
" \"projection_types\": {\n",
" \"uuid\": \"injected\",\n",
" }\n",
" },\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>value</th>\n",
" <th>uuid</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>b89ed095-8179-4635-9537-88592c0f6bc3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" value uuid\n",
"0 2 b89ed095-8179-4635-9537-88592c0f6bc3"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.athena.read_sql_query(\n",
" sql=\"SELECT * FROM table_injected WHERE uuid='b89ed095-8179-4635-9537-88592c0f6bc3'\", database=\"default\"\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Cleaning Up"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"wr.s3.delete_objects(f\"s3://{bucket}/table_integer/\")\n",
"wr.s3.delete_objects(f\"s3://{bucket}/table_enum/\")\n",
"wr.s3.delete_objects(f\"s3://{bucket}/table_date/\")\n",
"wr.s3.delete_objects(f\"s3://{bucket}/table_injected/\")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"wr.catalog.delete_table_if_exists(table=\"table_integer\", database=\"default\")\n",
"wr.catalog.delete_table_if_exists(table=\"table_enum\", database=\"default\")\n",
"wr.catalog.delete_table_if_exists(table=\"table_date\", database=\"default\")\n",
"wr.catalog.delete_table_if_exists(table=\"table_injected\", database=\"default\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.14",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.14"
}
},
"nbformat": 4,
"nbformat_minor": 4
}